{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Vllm Setup (Using GLM-4 as an example)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 1\n",
    "install necessary dependency, especially [vllm](https://pypi.org/project/vllm/) and [transformers](https://pypi.org/project/transformers/)\n",
    "\n",
    "using python 3.11+ and conda environment is recommended\n",
    "\n",
    "```sh\n",
    "conda create -n vllm python=3.11\n",
    "pip install -r requirements.txt & pip install vllm transformers\n",
    "```\n",
    "\n",
    "### Step 2\n",
    "By default, vLLM downloads model from HuggingFace. If you would like to use models from ModelScope in the following examples, please set the environment variable:\n",
    "\n",
    "```sh\n",
    "export VLLM_USE_MODELSCOPE=True\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 3 \n",
    "use vLLM for offline batched inference of GLM-4-9B-Chat and run it in our modelscope-agent!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-06-07 17:19:20,884 - modelscope - INFO - PyTorch version 2.3.0 Found.\n",
      "2024-06-07 17:19:20,887 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer\n",
      "2024-06-07 17:19:21,116 - modelscope - INFO - No valid ast index found from /mnt/workspace/.cache/modelscope/ast_indexer, generating ast index from prebuilt!\n",
      "2024-06-07 17:19:21,181 - modelscope - INFO - Loading done! Current index file version is 1.12.0, with md5 f25c671d28d2d28625a947bf48aaa515 and a total number of 964 components indexed\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING 06-07 17:19:23 tokenizer.py:126] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead.\n",
      "INFO 06-07 17:19:28 model_runner.py:146] Loading model weights took 17.5635 GB\n",
      "INFO 06-07 17:19:38 gpu_executor.py:83] # GPU blocks: 57488, # CPU blocks: 6553\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.36it/s, Generation Speed: 61.43 toks/s]\n",
      "2024-06-07 17:19:40.993 - modelscope-agent - INFO -  | message: call vllm success, output: \n",
      "哈哈，嘿兄弟，诸葛同学，啥事呀？我这几天连轴转，忙得脚打后脑勺，啥风把你给吹来了？说吧，是不是又有啥新鲜事啊？\n",
      "2024-06-07 17:19:40.994 - modelscope-agent - INFO -  | message: call llm 1 times output: \n",
      "哈哈，嘿兄弟，诸葛同学，啥事呀？我这几天连轴转，忙得脚打后脑勺，啥风把你给吹来了？说吧，是不是又有啥新鲜事啊？\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "哈哈，嘿兄弟，诸葛同学，啥事呀？我这几天连轴转，忙得脚打后脑勺，啥风把你给吹来了？说吧，是不是又有啥新鲜事啊？\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "sys.path.insert(0, '/your/path/to/modelscope_agent')\n",
    "from modelscope_agent.agents.role_play import RolePlay \n",
    "from vllm import LLM, SamplingParams\n",
    "from transformers import AutoTokenizer\n",
    "model_name='THUDM/glm-4-9b-chat'\n",
    "\n",
    "# you can also use local model by setting the \"model_name\" as the llm local path\n",
    "# model_name=\"/path/to/model/glm-4-9b-chat\" \n",
    "\n",
    "# create an vllm.LLM instance and set up the tokenizer and sampling_params\n",
    "# detailed information can be found in https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
    "llm = LLM(\n",
    "    model=model_name,\n",
    "    tensor_parallel_size=1,\n",
    "    max_model_len=131072,\n",
    "    trust_remote_code=True,\n",
    "    enforce_eager=True,\n",
    ")\n",
    "sampling_params = SamplingParams(temperature=0.95, max_tokens=1024, stop_token_ids=[151329, 151336, 151338])\n",
    "\n",
    "\n",
    "# setting up a llm config\n",
    "llm_config = {'model': 'THUDM/glm-4-9b-chat',\n",
    "              'model_server':'vllm',\n",
    "              'tokenizer':tokenizer,\n",
    "              'llm':llm,\n",
    "              'sampling_params':sampling_params,\n",
    "              'stream':True}\n",
    "function_list = []\n",
    "role_template = '你是一个agent小助手，你需要根据用户的要求来回答他们的问题'\n",
    "bot = RolePlay(function_list=function_list,llm=llm_config, instruction=role_template)\n",
    "\n",
    "response = bot.run('你好，请以李云龙的语气和我对话')\n",
    "text = ''\n",
    "for chunk in response:\n",
    "    text += chunk\n",
    "print(text)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "glm4",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
